In [1]:
import sys
import logging
In [2]:
# Import the GEM-PRO class
from ssbio.pipeline.gempro import GEMPRO
In [3]:
# Printing multiple outputs per cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
Set the logging level in logger.setLevel(logging.<LEVEL_HERE>)
to specify how verbose you want the pipeline to be. Debug is most verbose.
CRITICAL
ERROR
WARNING
INFO
(default)DEBUG
In [4]:
# Create logger
logger = logging.getLogger()
logger.setLevel(logging.INFO) # SET YOUR LOGGING LEVEL HERE #
In [5]:
# Other logger stuff for Jupyter notebooks
handler = logging.StreamHandler(sys.stderr)
formatter = logging.Formatter('[%(asctime)s] [%(name)s] %(levelname)s: %(message)s', datefmt="%Y-%m-%d %H:%M")
handler.setFormatter(formatter)
logger.handlers = [handler]
Set these three things:
ROOT_DIR
PROJECT
will be createdPROJECT
LIST_OF_GENES
A directory will be created in ROOT_DIR
with your PROJECT
name. The folders are organized like so:
ROOT_DIR
└── PROJECT
├── data # General storage for pipeline outputs
├── model # SBML and GEM-PRO models are stored here
├── genes # Per gene information
│ ├── <gene_id1> # Specific gene directory
│ │ └── protein
│ │ ├── sequences # Protein sequence files, alignments, etc.
│ │ └── structures # Protein structure files, calculations, etc.
│ └── <gene_id2>
│ └── protein
│ ├── sequences
│ └── structures
├── reactions # Per reaction information
│ └── <reaction_id1> # Specific reaction directory
│ └── complex
│ └── structures # Protein complex files
└── metabolites # Per metabolite information
└── <metabolite_id1> # Specific metabolite directory
└── chemical
└── structures # Metabolite 2D and 3D structure files
In [6]:
# SET FOLDERS AND DATA HERE
import tempfile
ROOT_DIR = tempfile.gettempdir()
PROJECT = 'genes_and_sequences_GP'
GENES_AND_SEQUENCES = {'b0870': 'MIDLRSDTVTRPSRAMLEAMMAAPVGDDVYGDDPTVNALQDYAAELSGKEAAIFLPTGTQANLVALLSHCERGEEYIVGQAAHNYLFEAGGAAVLGSIQPQPIDAAADGTLPLDKVAMKIKPDDIHFARTKLLSLENTHNGKVLPREYLKEAWEFTRERNLALHVDGARIFNAVVAYGCELKEITQYCDSFTICLSKGLGTPVGSLLVGNRDYIKRAIRWRKMTGGGMRQSGILAAAGIYALKNNVARLQEDHDNAAWMAEQLREAGADVMRQDTNMLFVRVGEENAAALGEYMKARNVLINASPIVRLVTHLDVSREQLAEVAAHWRAFLAR',
'b3041': 'MNQTLLSSFGTPFERVENALAALREGRGVMVLDDEDRENEGDMIFPAETMTVEQMALTIRHGSGIVCLCITEDRRKQLDLPMMVENNTSAYGTGFTVTIEAAEGVTTGVSAADRITTVRAAIADGAKPSDLNRPGHVFPLRAQAGGVLTRGGHTEATIDLMTLAGFKPAGVLCELTNDDGTMARAPECIEFANKHNMALVTIEDLVAYRQAHERKAS'}
PDB_FILE_TYPE = 'mmtf'
In [7]:
# Create the GEM-PRO project
my_gempro = GEMPRO(gem_name=PROJECT, root_dir=ROOT_DIR, genes_and_sequences=GENES_AND_SEQUENCES, pdb_file_type=PDB_FILE_TYPE)
In [8]:
# Mapping using BLAST
my_gempro.blast_seqs_to_pdb(all_genes=True, seq_ident_cutoff=.9, evalue=0.00001)
my_gempro.df_pdb_blast.head(2)
Out[8]:
In [9]:
# Download all mapped PDBs and gather the metadata
my_gempro.download_all_pdbs()
my_gempro.df_pdb_metadata.head(2)
Out[9]:
In [10]:
# Set representative structures
my_gempro.set_representative_structure()
my_gempro.df_representative_structures.head()
Out[10]:
In [11]:
# Looking at the information saved within a gene
my_gempro.genes.get_by_id('b0870').protein.representative_structure
my_gempro.genes.get_by_id('b0870').protein.representative_structure.get_dict()
Out[11]:
Out[11]:
For those proteins with no representative structure, we can create homology models for them. ssbio
contains some built in functions for easily running I-TASSER locally or on machines with SLURM
(ie. on NERSC) or Torque
job scheduling.
You can load in I-TASSER models once they complete using the get_itasser_models
later.
In [12]:
# Prep I-TASSER model folders
my_gempro.prep_itasser_modeling('~/software/I-TASSER4.4', '~/software/ITLIB/', runtype='local', all_genes=False)
In [13]:
import os.path as op
my_gempro.save_json(op.join(my_gempro.model_dir, '{}.json'.format(my_gempro.id)), compression=False)